Data Visualization with Python
Kuo, Yao-Jen yaojenkuo@ntu.edu.tw from DATAINPOINT
from datetime import date
from datetime import timedelta
from urllib.error import HTTPError
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import plotly.express as px
Modern data science is a huge field. It involves applications and tools for importing, tidying, transformation, visualization, modeling, and communication. Surrounding all of these is programming.

Source: R for Data Science
What data visualization libraries or tools do you use on a regular basis? (Select all that apply)
# NOTE(review): `ks` is not imported or defined in the visible part of this
# file — confirm where it comes from before running this cell.
ks.plot_survey_summary(question_index="Q14", n=3)
https://insights.stackoverflow.com/trends?tags=matplotlib%2Cseaborn%2Cplotly%2Cbokeh
What is matplotlib?
Matplotlib, short for MATLAB plotting library, is a comprehensive library for creating static, animated, and interactive visualizations in Python.
Source: https://matplotlib.org/
Import matplotlib with the import command
Matplotlib is officially aliased as mpl.
import matplotlib as mpl
If matplotlib is not installed, importing it raises a ModuleNotFoundError
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ModuleNotFoundError: No module named 'matplotlib'
Use pip install at Terminal to install matplotlib
pip install matplotlib
Check the installed version with the __version__ attribute.
Check the installation path with the __file__ attribute.
print(mpl.__version__)
print(mpl.__file__)
3.5.1 /Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/matplotlib/__init__.py
Throughout this chapter we use matplotlib.pyplot, a plotting module in matplotlib that is officially aliased as plt.
import matplotlib.pyplot as plt
# Two sample domains of 100 points each, both spanning a width of 2*pi.
x1 = np.linspace(0, np.pi*2, 100)
x2 = np.linspace(np.pi * 0.5, np.pi*2.5, 100)
# State-based pyplot interface: every plt.* call acts on the current
# figure/axes, so the order of the statements below matters.
plt.figure()
# Make the first panel of a 2-row, 1-column grid current, then draw on it.
plt.subplot(2, 1, 1)
plt.plot(x1, np.sin(x1))
# Switch to the second panel and draw the cosine curve there.
plt.subplot(2, 1, 2)
plt.plot(x2, np.cos(x2))
plt.show()
# Object-oriented interface: plt.subplots(2, 1) returns the figure together
# with its two axes, and each panel is drawn through its own axes object
# instead of through pyplot's "current axes" state.
fig, axes = plt.subplots(2, 1)
axes[0].plot(x1, np.sin(x1))
axes[1].plot(x2, np.cos(x2))
plt.show()
# 100 samples of sin(x) on [0, 2*pi]; reused by the single-panel examples below.
x = np.linspace(0, 2*np.pi, 100)
y = np.sin(x)
# Create a figure and an axes with two separate calls ...
fig = plt.figure()
ax = plt.axes()
# ... or obtain both from a single call.
fig, ax = plt.subplots()
# Draw the curve by calling plot() on the axes object.
fig = plt.figure()
ax = plt.axes()
ax.plot(x, y)
[<matplotlib.lines.Line2D at 0x7fd7b599d940>]
# Add a title and an x-axis label to the plot.
fig = plt.figure()
ax = plt.axes()
ax.plot(x, y)
ax.set_title('A simple plot')
# This cell does not end with plt.show(), so the return value of the last
# call is what gets echoed as the cell's text output.
ax.set_xlabel('x')
Text(0.5, 0, 'x')
# Same plot as before, now finished with plt.show() to display the figure.
fig = plt.figure()
ax = plt.axes()
ax.plot(x, y)
ax.set_title('A simple plot')
ax.set_xlabel('x')
plt.show()
Various formats are supported.
# Ask the figure's canvas which output file types it can write; the result
# maps each file extension to a description of the format.
fig = plt.figure()
fig.canvas.get_supported_filetypes()
{'eps': 'Encapsulated Postscript',
'jpg': 'Joint Photographic Experts Group',
'jpeg': 'Joint Photographic Experts Group',
'pdf': 'Portable Document Format',
'pgf': 'PGF code for LaTeX',
'png': 'Portable Network Graphics',
'ps': 'Postscript',
'raw': 'Raw RGBA bitmap',
'rgba': 'Raw RGBA bitmap',
'svg': 'Scalable Vector Graphics',
'svgz': 'Scalable Vector Graphics',
'tif': 'Tagged Image File Format',
'tiff': 'Tagged Image File Format'}
<Figure size 432x288 with 0 Axes>
# Build the plot and write it to an image file.
fig = plt.figure()
ax = plt.axes()
ax.plot(x, y)
ax.set_title('A simple plot')
ax.set_xlabel('x')
# The path is relative, so the PNG lands in the current working directory.
fig.savefig('a_simple_plot.png')
# scatter() for scatter plots.
# hist() for histograms.
# plot() for lines.
# bar() for bar plots.
# imshow() for showing images in ndarray format.
def get_latest_daily_report():
    """Fetch the most recent daily report CSV that is available.

    Starts at today's date and walks back one day at a time, trying the
    daily-report URL for each date until one can be read. Progress is
    printed for every attempt.

    Returns:
        The DataFrame parsed from the first daily report that loads.

    Raises:
        Any error from ``pd.read_csv`` other than ``HTTPError`` propagates
        unchanged.
    """
    url_template = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{}.csv"
    one_day = timedelta(days=1)
    candidate = date.today()
    while True:
        # Report files are named after their date in MM-DD-YYYY form.
        stamp = candidate.strftime('%m-%d-%Y')
        print("Try importing {} data...".format(stamp))
        try:
            report = pd.read_csv(url_template.format(stamp))
        except HTTPError:
            # No file for this date; fall back to the previous day.
            candidate -= one_day
            continue
        print("Successfully imported {} data!".format(stamp))
        return report
daily_report = get_latest_daily_report()
Try importing 05-11-2022 data... Try importing 05-10-2022 data... Successfully imported 05-10-2022 data!
# scatter for scatter plots
x = daily_report["Incident_Rate"].to_numpy()
y = daily_report["Case_Fatality_Ratio"].to_numpy()
fig, ax = plt.subplots()
# One marker per row of the daily report, drawn with marker size s=1.
ax.scatter(x, y, s=1)
plt.show()
# hist for histograms
x = daily_report["Incident_Rate"].to_numpy() # cases per 100,000 persons
fig, ax = plt.subplots()
# Distribution of the incident rate across report rows, in 50 bins.
ax.hist(x, bins=50)
plt.show()
def get_confirmed_time_series():
    """Download the global confirmed-cases time series and reshape it to long form.

    The source CSV is wide: its first four columns identify a location and
    every remaining column is treated as one date. The table is melted to
    one row per location and date, the date labels are parsed into
    datetimes, and the counts are summed per country/region and date.

    Returns:
        A DataFrame with the columns ``Country/Region``, ``Date`` and
        ``Confirmed``.
    """
    request_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
    wide = pd.read_csv(request_url)
    long_form = wide.melt(id_vars=wide.columns[:4], var_name='Date', value_name='Confirmed')
    # Replace the textual date labels with real datetimes, in place.
    long_form['Date'] = pd.to_datetime(long_form['Date'])
    # Collapse rows that share a country/region into one total per date.
    totals = long_form.groupby(['Country/Region', 'Date'])['Confirmed'].sum()
    return totals.reset_index()
confirmed_time_series = get_confirmed_time_series()
# plot for lines
# Sum the confirmed counts over all countries for each date.
global_ts = confirmed_time_series.groupby("Date")["Confirmed"].sum()
x, y = global_ts.index, global_ts.to_numpy()
fig, ax = plt.subplots()
ax.plot(x, y)
plt.show()
# bar for vertical bar plots
country_list = [
    "US",
    "United Kingdom",
    "France",
    "Germany",
    "Canada",
    "Korea, South",
    "Japan",
    "Singapore",
    "Australia",
    "Taiwan*",
    "New Zealand",
]
# Total confirmed cases per country, sorted ascending.
confirmed_by_country = daily_report.groupby('Country_Region')['Confirmed'].sum().sort_values()
# Keep only the countries named in country_list.
filtered_confirmed_by_country = confirmed_by_country.loc[confirmed_by_country.index.isin(country_list)]
fig, ax = plt.subplots()
x = filtered_confirmed_by_country.index
height = filtered_confirmed_by_country.to_numpy()
ax.bar(x, height)
plt.show()
# barh for horizontal bar plots
fig, ax = plt.subplots()
# For horizontal bars the categories go on y and the bar lengths are widths.
y = filtered_confirmed_by_country.index
width = filtered_confirmed_by_country.to_numpy()
ax.barh(y, width)
plt.show()
# set_title to add title.
# set_xlabel to add x-axis label.
# set_ylabel to add y-axis label.
fig = plt.figure()
ax = plt.axes()
y = filtered_confirmed_by_country.index
width = filtered_confirmed_by_country.values
ax.barh(y, width)
# The bars are the hand-picked countries in country_list, not a top-ten
# ranking, so the title describes them as a selection.
ax.set_title('Confirmed Cases in Selected Countries')
ax.set_xlabel('Confirmed')
ax.set_ylabel('Country Name')
plt.show()
# set_xticks to adjust x-axis ticks.
# set_yticks to adjust y-axis ticks.
fig, ax = plt.subplots()
y = filtered_confirmed_by_country.index
width = filtered_confirmed_by_country.to_numpy()
ax.barh(y, width)
# An empty tick list removes every tick (and its label) from that axis.
ax.set_xticks([])
ax.set_yticks([])
plt.show()
/var/folders/0b/r__z5mpn6ldgb_w2j7_y_ntr0000gn/T/ipykernel_30617/2388895212.py:7: MatplotlibDeprecationWarning: Support for passing numbers through unit converters is deprecated since 3.5 and support will be removed two minor releases later; use Axis.convert_units instead. ax.set_yticks([])
# set_xticklabels to adjust x-axis tick labels.
# set_yticklabels to adjust y-axis tick labels.
fig = plt.figure()
ax = plt.axes()
y = filtered_confirmed_by_country.index
width = filtered_confirmed_by_country.values
ax.barh(y, width)
# Label only the bars at positions 7, 8 and 9. Each label is read from the
# index entry drawn at that position, so the text always names the country
# the bar belongs to. Positions beyond the last bar are skipped.
tick_positions = [pos for pos in (7, 8, 9) if pos < len(y)]
ax.set_yticks(tick_positions)
ax.set_yticklabels(y[tick_positions])
plt.show()
# set_xlim to adjust the upper/lower limits of x-axis.
# set_ylim to adjust the upper/lower limits of y-axis.
fig, ax = plt.subplots()
y = filtered_confirmed_by_country.index
width = filtered_confirmed_by_country.to_numpy()
ax.barh(y, width)
# Restrict the view to the bars at positions 7, 8 and 9.
ax.set_ylim(bottom=6.5, top=9.5)
plt.show()
Use text(x, y, 'Some Strings') to add 'Some Strings' at (x, y).
fig = plt.figure()
ax = plt.axes()
ax.barh(filtered_confirmed_by_country.index, filtered_confirmed_by_country.values)
# Write each bar's value just to the right of the bar. The bar's position on
# the y-axis is its ordinal index, so enumerate() supplies both coordinates.
for i, v in enumerate(filtered_confirmed_by_country.values):
    ax.text(v + 1_000_000, i - 0.2, f"{v:,}")
# Extend the x-axis beyond the longest bar so its label is not clipped.
ax.set_xlim(0, filtered_confirmed_by_country.max() + 2e7)
plt.show()
Use legend() and the label parameter to add a legend to indicate categories.
def plot_lines_with_legends(country_names, ts_df):
    """Plot one confirmed-cases line per requested country, with a legend.

    Args:
        country_names: Iterable of country names. Each name is matched as a
            literal substring of the ``Country/Region`` column, so a short
            form such as 'Taiwan' selects the rows stored as 'Taiwan*'.
        ts_df: DataFrame with ``Country/Region``, ``Date`` and ``Confirmed``
            columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure()
    ax = plt.axes()
    for country in country_names:
        # regex=False: country names are plain text, and some contain regex
        # metacharacters (e.g. the '*' in 'Taiwan*') that must not be
        # interpreted as a pattern.
        is_match = ts_df['Country/Region'].str.contains(country, regex=False)
        country_data = ts_df[is_match]
        ax.plot(country_data['Date'], country_data['Confirmed'], label=country)
    ax.set_xlabel('Date')
    ax.set_title('Cumulative COVID-19 confirmed cases')
    # The legend entries come from the label= given to each plot() call.
    ax.legend()
    plt.show()
plot_lines_with_legends(['Taiwan', 'Singapore', 'New Zealand'], confirmed_time_series)
# Plot Taiwan's series with Chinese text in the legend, title and x-label,
# using whatever font matplotlib picks by default.
# NOTE(review): if that font has no CJK glyphs, matplotlib emits
# "Glyph ... missing from current font" warnings and cannot render the text.
fig = plt.figure()
ax = plt.axes()
tw = confirmed_time_series[confirmed_time_series['Country/Region'].str.contains('Taiwan')]
ax.plot(tw['Date'].values, tw['Confirmed'].values, label='台灣')  # label: "Taiwan"
ax.set_title('COVID-19 累計確診人數')  # title: "COVID-19 cumulative confirmed cases"
ax.set_xlabel('日期')  # x-label: "Date"
ax.legend()
plt.show()
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 32047 (\N{CJK UNIFIED IDEOGRAPH-7D2F}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 35336 (\N{CJK UNIFIED IDEOGRAPH-8A08}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 30906 (\N{CJK UNIFIED IDEOGRAPH-78BA}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 35386 (\N{CJK UNIFIED IDEOGRAPH-8A3A}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 20154 (\N{CJK UNIFIED IDEOGRAPH-4EBA}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 25976 (\N{CJK UNIFIED IDEOGRAPH-6578}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 26085 (\N{CJK UNIFIED IDEOGRAPH-65E5}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 26399 (\N{CJK UNIFIED IDEOGRAPH-671F}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 21488 (\N{CJK UNIFIED IDEOGRAPH-53F0}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
/Users/kuoyaojen/opt/miniconda3/envs/python39/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Glyph 28771 (\N{CJK UNIFIED IDEOGRAPH-7063}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
# FontProperties function from matplotlib.font_manager
from matplotlib.font_manager import FontProperties
# NOTE(review): this font path looks macOS-specific — point fname at a
# CJK-capable font file when running on another system.
my_font = FontProperties(fname="/System/Library/Fonts/STHeiti Light.ttc") # Specify a font supporting Chinese
fig, ax = plt.subplots()
tw = confirmed_time_series.loc[confirmed_time_series['Country/Region'].str.contains('Taiwan')]
ax.plot(tw['Date'].to_numpy(), tw['Confirmed'].to_numpy(), label='台灣')
# Hand the font to every text element that contains Chinese characters.
ax.set_title('COVID-19 累計確診人數', fontproperties=my_font)
ax.set_xlabel('日期', fontproperties=my_font)
ax.legend(prop=my_font)
plt.show()
# subplots(m, n) to create a ndarray with a shape (m, n).
# Select a single axes from the ndarray with [m, n].
fig, axes = plt.subplots(nrows=2, ncols=3)
# Show the container type and the grid shape, one per line.
print(type(axes), axes.shape, sep="\n")
<class 'numpy.ndarray'> (2, 3)
# Sample 100 points of a sine and a cosine curve on their own domains.
x_sin = np.linspace(0, 2*np.pi, 100)
x_cos = np.linspace(0.5*np.pi, 2.5*np.pi, 100)
y_sin = np.sin(x_sin)
y_cos = np.cos(x_cos)
# Create a 2x3 grid and draw on only two of its panels, addressed by
# [row, column]; the other four axes stay empty.
fig, axes = plt.subplots(2, 3)
axes[0, 1].plot(x_sin, y_sin)
axes[1, 2].plot(x_cos, y_cos)
plt.show()
Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.
Source: https://seaborn.pydata.org
# Plotting with Seaborn
# Keep only the three countries to compare.
country_ts_data = confirmed_time_series[confirmed_time_series['Country/Region'].isin(['Taiwan*', 'Singapore', 'New Zealand'])]
# sns.lineplot returns the matplotlib Axes it drew on (not a Figure), so the
# result is bound to `ax`, matching how the name is used elsewhere in this file.
ax = sns.lineplot(x="Date", y="Confirmed", hue="Country/Region", data=country_ts_data)
# Remove the x-axis ticks.
ax.set(xticks=[])
[[]]
# Total confirmed cases per province/state, as a Series indexed by name ...
confirmed_by_province_state = daily_report.groupby(["Province_State"])['Confirmed'].sum()
# ... and as a two-column DataFrame (Province_State, Confirmed).
confirmed_by_province_state_df = confirmed_by_province_state.reset_index()
confirmed_by_province_state_df
| Province_State | Confirmed | |
|---|---|---|
| 0 | Abruzzo | 388718 |
| 1 | Acre | 124969 |
| 2 | Adygea Republic | 49283 |
| 3 | Aguascalientes | 66137 |
| 4 | Aichi | 500955 |
| ... | ... | ... |
| 591 | Zaporizhia Oblast | 215070 |
| 592 | Zeeland | 168005 |
| 593 | Zhejiang | 3130 |
| 594 | Zhytomyr Oblast | 192801 |
| 595 | Zuid-Holland | 1689703 |
596 rows × 2 columns
# Plotting with Folium
# GeoJSON with the US state outlines, taken from folium's example data.
url = "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
state_geo = f"{url}/us-states.json"
# Base map centred on latitude 48, longitude -102.
m = folium.Map(location=[48, -102], zoom_start=3)
# Shade each state by its confirmed count. Rows are joined to shapes by
# matching the Province_State column against each feature's properties.name.
# NOTE(review): the data frame is built from every Province_State in the
# daily report, presumably including non-US regions; only names that also
# appear in the GeoJSON can be shaded — confirm this is intended.
folium.Choropleth(
geo_data=state_geo,
name="choropleth",
data=confirmed_by_province_state_df,
columns=["Province_State", "Confirmed"],
key_on="feature.properties.name",
fill_color="YlGn",
fill_opacity=0.7,
line_opacity=0.2,
legend_name="Confirmed",
).add_to(m)
# Add a control for toggling the map's layers.
folium.LayerControl().add_to(m)
<folium.map.LayerControl at 0x7fd7b6c38fd0>
m
# Plotting with Plotly
# Turn the per-country Series into a two-column frame, largest count first.
bar_data = filtered_confirmed_by_country.reset_index().sort_values("Confirmed", ascending=False)
fig = px.bar(
    bar_data,
    x="Confirmed",
    y="Country_Region",
    color="Country_Region",
)
fig.show()
# Plotting with Plotly
# Restrict the time series to the countries named in country_list.
country_ts_data = confirmed_time_series.loc[confirmed_time_series['Country/Region'].isin(country_list)]
# One spline-shaped line per country, coloured and grouped by country name.
fig = px.line(
    country_ts_data,
    x="Date",
    y="Confirmed",
    color="Country/Region",
    line_group="Country/Region",
    hover_name="Country/Region",
    line_shape="spline",
    render_mode="svg",
)
fig.show()
# Plotting with Plotly
# Total confirmed cases per location key and coordinate pair.
confirmed_by_lat_long = daily_report.groupby(['Combined_Key', "Lat", "Long_"])['Confirmed'].sum()
confirmed_by_lat_long_df = confirmed_by_lat_long.reset_index()
# One bubble per location; both its size and its colour encode the count.
fig = px.scatter_mapbox(
    confirmed_by_lat_long_df,
    lat="Lat",
    lon="Long_",
    size="Confirmed",
    color="Confirmed",
    size_max=50,
    mapbox_style="carto-positron",
    zoom=0.5,
    hover_name="Combined_Key",
)
fig.show()